library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(magrittr)
library(XML) 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
library(stringi)
library(rvest) 
## Loading required package: xml2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
## 
##     xml
library(ggplot2)
library(RCurl) 
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(httr)
## 
## Attaching package: 'httr'
## The following object is masked from 'package:plotly':
## 
##     config
## The following object is masked from 'package:caret':
## 
##     progress
library(jsonlite)
library(e1071)
library(naniar)
library(scales)
library(gghighlight)
library(stringr)
library(knitr)
knitr::opts_chunk$set(error = TRUE)
beers=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Beers.csv',header=T)
brewery=read.csv('/Users/renfengwang/Documents/SMU\ Data\ Science\ Program/Doing\ Data\ Science/Project\ 1/Breweries.csv',header=T)
head(beers)
##                  Name Beer_ID   ABV IBU Brewery_id
## 1            Pub Beer    1436 0.050  NA        409
## 2         Devil's Cup    2265 0.066  NA        178
## 3 Rise of the Phoenix    2264 0.071  NA        178
## 4            Sinister    2263 0.090  NA        178
## 5       Sex and Candy    2262 0.075  NA        178
## 6        Black Exodus    2261 0.077  NA        178
##                            Style Ounces
## 1            American Pale Lager     12
## 2        American Pale Ale (APA)     12
## 3                   American IPA     12
## 4 American Double / Imperial IPA     12
## 5                   American IPA     12
## 6                  Oatmeal Stout     12
head(brewery)
##   Brew_ID                      Name          City State
## 1       1        NorthGate Brewing    Minneapolis    MN
## 2       2 Against the Grain Brewery    Louisville    KY
## 3       3  Jack's Abby Craft Lagers    Framingham    MA
## 4       4 Mike Hess Brewing Company     San Diego    CA
## 5       5   Fort Point Beer Company San Francisco    CA
## 6       6     COAST Brewing Company    Charleston    SC
#Question 1
brewery %>% arrange(State) %>% ggplot(aes(x=State),count=Name)+geom_bar()+geom_text(aes(label=..count..),stat='count',vjust=-.5) +
  xlab('States')+ ylab('Brewery Numbers') + ggtitle('Numbers of Brewery by State')  #Bar plot to count the brewery numbers in each state

#Question 2
colnames(beers)[colnames(beers)=='Brewery_id']='Brew_ID'  #Change one of the data frame column names before merging two data sets.
df_beer=merge(beers, brewery, by='Brew_ID', all=T)  #Outer join two data frames.
colnames(df_beer)[colnames(df_beer)=='Name.x']='Beer_Name'
colnames(df_beer)[colnames(df_beer)=='Name.y']='Brewery_Name'

head(df_beer) 
##   Brew_ID     Beer_Name Beer_ID   ABV IBU                               Style
## 1       1  Get Together    2692 0.045  50                        American IPA
## 2       1 Maggie's Leap    2691 0.049  26                  Milk / Sweet Stout
## 3       1    Wall's End    2690 0.048  19                   English Brown Ale
## 4       1       Pumpion    2689 0.060  38                         Pumpkin Ale
## 5       1    Stronghold    2688 0.060  25                     American Porter
## 6       1   Parapet ESB    2687 0.056  47 Extra Special / Strong Bitter (ESB)
##   Ounces       Brewery_Name        City State
## 1     16 NorthGate Brewing  Minneapolis    MN
## 2     16 NorthGate Brewing  Minneapolis    MN
## 3     16 NorthGate Brewing  Minneapolis    MN
## 4     16 NorthGate Brewing  Minneapolis    MN
## 5     16 NorthGate Brewing  Minneapolis    MN
## 6     16 NorthGate Brewing  Minneapolis    MN
tail(df_beer)
##      Brew_ID                 Beer_Name Beer_ID   ABV IBU
## 2405     556             Pilsner Ukiah      98 0.055  NA
## 2406     557  Heinnieweisse Weissebier      52 0.049  NA
## 2407     557           Snapperhead IPA      51 0.068  NA
## 2408     557         Moo Thunder Stout      50 0.049  NA
## 2409     557         Porkslap Pale Ale      49 0.043  NA
## 2410     558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                  Brewery_Name          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK
#Question 3
df_beerall=df_beer %>% replace_with_na_all(condition = ~.x=='')
df_beerall=df_beerall %>% replace_with_na_all(condition = ~.x==' ')
gg_miss_var(df_beerall)

table(is.na(df_beerall$IBU))
## 
## FALSE  TRUE 
##  1405  1005
table(is.na(df_beerall$ABV))
## 
## FALSE  TRUE 
##  2348    62

We can see there are missing values in both ABV and IBU columns. In ABV column, there are 62 missing values. In IBU column, there are 1005 missing values.

#Question 4
Median_ABV_IBU=df_beerall %>% arrange(State) %>% group_by(State) %>% summarize(Median_ABV=median(ABV, na.rm=TRUE), Median_IBU=median(IBU,na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_ABV,width=.5))+geom_bar(stat='identity') + 
  geom_text(aes(label=percent(Median_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
  xlab('States') +ylab('Median Alcoholic Content')+ggtitle('Median Alcoholic Content by State')

Median_ABV_IBU %>% ggplot(aes(x=State, y=Median_IBU,width=.5))+geom_bar(stat='identity') + 
  geom_text(aes(label=Median_IBU),vjust=-.5,size=2.5,check_overlap = T) +
  xlab('States') +ylab('Median International Bitterness')+ggtitle('Median International Bitterness by State') #Removed state=SD as NA
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 1 rows containing missing values (geom_text).

State South Dakota doesn’t have IBU values, but has ABV values.

#Question 5
Max_ABV_IBU=df_beerall %>% arrange(State) %>% group_by(State) %>% summarize(Max_ABV=max(ABV, na.rm=TRUE), Max_IBU=max(IBU,na.rm=T))
## Warning in max(IBU, na.rm = T): no non-missing arguments to max; returning -Inf
## `summarise()` ungrouping output (override with `.groups` argument)
Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_ABV,width=.5))+geom_bar(stat='identity') + 
  geom_text(aes(label=percent(Max_ABV, accuracy = 0.01)),vjust=-.5,size=2.5,check_overlap = T) +
  xlab('States') +ylab('Max Alcoholic Content')+ggtitle('Max Alcoholic Content by State')

Max_ABV_IBU %>% ggplot(aes(x=State, y=Max_IBU,width=.5))+geom_bar(stat='identity') + 
  geom_text(aes(label=Max_IBU),vjust=-.5,size=2.5,check_overlap = T) +
  xlab('States') +ylab('Max International Bitterness')+ggtitle('Max International Bitterness by State')
## Warning: Removed 1 rows containing missing values (geom_bar).

We can see Colorado has the maximum Alcoholic beer with ABV 12.8% and Oregon has the most bitter beer with IBU 138.

#Question 6
df_beerall%>% summarize(Mean=mean(ABV, na.rm=TRUE),
                                             Median=median(ABV,na.rm=T),
                                             Min=min(ABV,na.rm=T),
                                             Max=max(ABV,na.rm=T),
                                             SD=sd(ABV,na.rm=T),
                                             N=n())
## # A tibble: 1 x 6
##     Mean Median   Min   Max     SD     N
##    <dbl>  <dbl> <dbl> <dbl>  <dbl> <int>
## 1 0.0598  0.056 0.001 0.128 0.0135  2410
df_beer %>% filter(!is.na(ABV)) %>% ggplot(aes(x=ABV))+geom_histogram(aes(y=..density..),colour='black',fill='white')+
  geom_density(alpha=.5, fill='#FF6666')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The distribution of ABV is right skewed. ABV of beers around 5% has the most counts. There are total 2410 non-missing ABV values in this data set. The maximum ABV is 12.8%, the minimum ABV is .1%, the median ABV is 5.6%. The mean ABV is 5.98% and standard deviation of ABV is 1.35%.

#Question 7
df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>% 
  ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth(method=loess)
## `geom_smooth()` using formula 'y ~ x'

Most beers with lower IBU (less than 50) have ABV values around 5%. When IBU value increases, ABV values spreads out. But most beers with IBU values above 50, their ABV values spread out within the region between 5% and 10%.

#Question 8  KNN
df_beer_IPA=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>% 
  filter(str_detect(Style, regex(str_c('\\b','IPA','\\b',sep=''), ignore_case = T)))

df_beer_IPA$Style=as.factor('IPA')

df_beer_Ale=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>% 
  filter(str_detect(Style, regex(str_c('\\b','Ale','\\b',sep=''), ignore_case = T)))

df_beer_Ale$Style=as.factor('Ale')

df_beer_test=rbind(df_beer_IPA, df_beer_Ale)

iterations = 500
numks = 30
splitPerc = .7
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
beer_train = df_beer_test[trainIndices,]
beer_test = df_beer_test[-trainIndices,]
classifications=knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = 5)
CM = confusionMatrix(table(classifications,beer_test$Style))
classifications
##   [1] IPA Ale Ale Ale IPA Ale IPA IPA IPA Ale IPA IPA Ale IPA IPA IPA IPA IPA
##  [19] Ale IPA IPA IPA IPA Ale Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA
##  [37] IPA IPA IPA IPA IPA IPA IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA IPA
##  [55] IPA Ale IPA IPA Ale IPA IPA IPA Ale IPA IPA IPA IPA IPA IPA IPA IPA Ale
##  [73] IPA IPA IPA IPA IPA Ale IPA IPA Ale IPA Ale IPA IPA IPA IPA Ale Ale IPA
##  [91] IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA IPA Ale
## [109] IPA IPA IPA IPA Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale
## [127] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale
## [145] Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale IPA Ale IPA Ale Ale Ale
## [163] IPA Ale Ale IPA Ale Ale Ale Ale Ale IPA IPA Ale Ale IPA Ale Ale Ale Ale
## [181] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale
## [199] IPA Ale IPA Ale IPA Ale IPA Ale IPA Ale Ale Ale Ale Ale Ale IPA IPA Ale
## [217] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale Ale Ale Ale
## [235] IPA Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale Ale
## [253] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA Ale Ale Ale Ale Ale Ale
## [271] Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale Ale IPA IPA Ale
## attr(,"prob")
##   [1] 0.6000000 0.7500000 0.8000000 1.0000000 1.0000000 0.6666667 1.0000000
##   [8] 1.0000000 0.6666667 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
##  [15] 1.0000000 0.8571429 1.0000000 1.0000000 0.5000000 0.6666667 1.0000000
##  [22] 1.0000000 0.6000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [29] 0.9166667 1.0000000 0.8000000 0.8333333 0.8333333 1.0000000 1.0000000
##  [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.9000000 0.7142857 1.0000000
##  [43] 1.0000000 0.8000000 1.0000000 0.8571429 0.7142857 1.0000000 1.0000000
##  [50] 1.0000000 0.8333333 0.8571429 0.8571429 0.8571429 1.0000000 1.0000000
##  [57] 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000 0.8000000 1.0000000
##  [64] 0.8000000 0.8571429 0.7777778 0.8333333 1.0000000 1.0000000 0.8888889
##  [71] 0.8333333 0.8571429 0.7777778 1.0000000 0.8000000 0.8888889 0.6000000
##  [78] 0.6666667 0.8571429 0.8000000 0.8333333 1.0000000 0.8000000 1.0000000
##  [85] 0.6666667 0.8888889 1.0000000 0.8000000 0.6000000 0.6000000 1.0000000
##  [92] 1.0000000 0.9000000 0.8333333 0.8333333 1.0000000 1.0000000 1.0000000
##  [99] 0.9000000 0.8888889 0.8888889 1.0000000 0.6000000 0.8571429 0.7500000
## [106] 1.0000000 1.0000000 0.8888889 0.8750000 0.8571429 0.8333333 1.0000000
## [113] 0.7500000 0.7500000 0.5555556 1.0000000 1.0000000 0.8888889 1.0000000
## [120] 1.0000000 0.5000000 1.0000000 1.0000000 1.0000000 0.8571429 0.5000000
## [127] 0.8333333 0.6000000 1.0000000 0.8750000 1.0000000 1.0000000 0.8333333
## [134] 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000 0.7777778
## [141] 1.0000000 1.0000000 1.0000000 0.9000000 1.0000000 1.0000000 1.0000000
## [148] 1.0000000 0.7142857 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
## [155] 1.0000000 1.0000000 0.6666667 0.8571429 0.8571429 0.6000000 1.0000000
## [162] 0.8000000 0.5714286 0.8000000 0.8000000 0.6000000 0.7500000 0.8000000
## [169] 1.0000000 0.8000000 1.0000000 0.6000000 1.0000000 1.0000000 1.0000000
## [176] 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8571429
## [183] 1.0000000 1.0000000 0.7500000 0.7500000 0.7500000 1.0000000 1.0000000
## [190] 1.0000000 0.6000000 0.5000000 0.8571429 0.8000000 1.0000000 0.8333333
## [197] 1.0000000 0.8000000 0.6000000 1.0000000 0.5714286 1.0000000 1.0000000
## [204] 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000
## [211] 1.0000000 1.0000000 0.8571429 0.8571429 0.6666667 1.0000000 0.7142857
## [218] 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
## [225] 1.0000000 1.0000000 1.0000000 1.0000000 0.6666667 1.0000000 1.0000000
## [232] 1.0000000 1.0000000 0.8000000 1.0000000 0.8000000 1.0000000 1.0000000
## [239] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [246] 1.0000000 0.6000000 1.0000000 0.8333333 0.5555556 0.6666667 1.0000000
## [253] 1.0000000 0.5555556 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [267] 1.0000000 1.0000000 1.0000000 1.0000000 0.5555556 1.0000000 1.0000000
## [274] 1.0000000 1.0000000 1.0000000 1.0000000 0.7500000 1.0000000 1.0000000
## [281] 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000
## Levels: IPA Ale
CM
## Confusion Matrix and Statistics
## 
##                
## classifications IPA Ale
##             IPA  92  26
##             Ale  23 144
##                                         
##                Accuracy : 0.8281        
##                  95% CI : (0.7792, 0.87)
##     No Information Rate : 0.5965        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.6443        
##                                         
##  Mcnemar's Test P-Value : 0.7751        
##                                         
##             Sensitivity : 0.8000        
##             Specificity : 0.8471        
##          Pos Pred Value : 0.7797        
##          Neg Pred Value : 0.8623        
##              Prevalence : 0.4035        
##          Detection Rate : 0.3228        
##    Detection Prevalence : 0.4140        
##       Balanced Accuracy : 0.8235        
##                                         
##        'Positive' Class : IPA           
## 

I randomly assigned 70% of original data to training data and 30% of the original data as testing data when seed value is set to 6. We can tell when k=5, the accuracy is about 82.81%.

for(j in 1:iterations)
{
  accs = data.frame(accuracy = numeric(30), k = numeric(30))
  trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
  beer_train = df_beer_test[trainIndices,]
  beer_test = df_beer_test[-trainIndices,]
  for(i in 1:numks)
  {
    classifications = knn(beer_train[,c(4,5)],beer_test[,c(4,5)],beer_train$Style, prob = TRUE, k = i)
    table(classifications,beer_test$Style)
    CM = confusionMatrix(table(classifications,beer_test$Style))
    masterAcc[j,i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
p=ggplot(mapping=aes(x=seq(1,numks,1), y=MeanAcc))+geom_line()
ggplotly(p)

I shuffled the training and testing data 500 times by 70%/30% split. And assigned integer k value from 1 to 30. From the plot, we can tell when k=5, it gives us the highest accuracy 85% which means we can predict the beer is either India Pale Ales or any other types of Ale by knowing its ABV and IBU values with 85% accuracy when sets nearest neighbor numbers equals to 5

#Naive Bayes--------------

iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7 
set.seed(6)
for(j in 1:iterations)
{
  
  trainIndices = sample(1:dim(df_beer_test)[1],round(splitPerc * dim(df_beer_test)[1]))
  beer_train = df_beer_test[trainIndices,]
  beer_test = df_beer_test[-trainIndices,]
  model = naiveBayes(beer_train[,c(4,5)],as.factor(beer_train$Style),laplace = 1)
  table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style))
  CM = confusionMatrix(table(predict(model,beer_test[,c(4,5)]),as.factor(beer_test$Style)))
  masterAcc[j] = CM$overall[1]
  masterSen[j] = CM$byClass[1]
  masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.8432491
MeanSen
## [1] 0.808263
MeanSpec
## [1] 0.8679429

Now I chose to use Naive Bayes model as I want to compare the accuracy with KNN model. I shuffled the training and testing data 500 times by 70%/30% split like I did in KNN model. The mean accuracy of Naive Bayes is 84% which is similar to what we got from KNN.

#Question 9 Sorted Lager, Stout, IPA
df_beer_stout=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>% 
  filter(str_detect(Style, regex(str_c('\\b','stout','\\b',sep=''), ignore_case = T)))

df_beer_lager=df_beer %>% filter(!is.na(ABV) &!is.na(IBU)) %>% 
  filter(str_detect(Style, regex(str_c('\\b','lager','\\b',sep=''), ignore_case = T)))

df_beer_stout$Style=as.factor('Stout')
df_beer_lager$Style=as.factor('Lager')

df_beer_sort=rbind(df_beer_IPA, df_beer_stout)
df_beer_sort=rbind(df_beer_sort, df_beer_lager)

iterations = 500
masterAcc = matrix(nrow = iterations)
masterSen = matrix(nrow = iterations)
masterSpec = matrix(nrow = iterations)
splitPerc = .7 
set.seed(6)
for(j in 1:iterations)
{
  
  trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
  beer_sort_train = df_beer_sort[trainIndices,]
  beer_sort_test = df_beer_sort[-trainIndices,]
  model = naiveBayes(beer_sort_train[,c(4,5)],as.factor(beer_sort_train$Style),laplace = 1)
  table(predict(model,beer_sort_test[,c(4,5)]),as.factor(beer_sort_test$Style))
  CM = confusionMatrix(table(predict(model,beer_sort_test[,c(4,5)]),as.factor(beer_sort_test$Style)))
  masterAcc[j] = CM$overall[1]
  masterSen[j] = CM$byClass[1]
  masterSpec[j] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanSen = colMeans(masterSen)
MeanSpec = colMeans(masterSpec)
MeanAcc
## [1] 0.8469811
MeanSen
## [1] 0.9374376
MeanSpec
## [1] 0.01695888

Now I want to use IBU and ABV values to predict non-IPA Ales, lager and stout these three beer styles. I shuffled the training and testing data 500 times by 70%/30% split like I did before. First I used Naive Bayes method. The mean accuracy is about 84.5%.

iterations = 500
numks = 30
splitPerc = .7
masterAcc = matrix(nrow = iterations, ncol = numks)
set.seed(6)
for(j in 1:iterations)
{
  accs = data.frame(accuracy = numeric(30), k = numeric(30))
  trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
  beer_sort_train = df_beer_sort[trainIndices,]
  beer_sort_test = df_beer_sort[-trainIndices,]
  for(i in 1:numks)
  {
    classifications = knn(beer_sort_train[,c(4,5)],beer_sort_test[,c(4,5)],beer_sort_train$Style, prob = TRUE, k = i)
    table(classifications,beer_sort_test$Style)
    CM = confusionMatrix(table(classifications,beer_sort_test$Style))
    masterAcc[j,i] = CM$overall[1]
  }
}
MeanAcc = colMeans(masterAcc)
e=ggplot(mapping=aes(x=seq(1,numks,1), y=MeanAcc))+geom_line()
ggplotly(e)
MeanAcc
##  [1] 0.8788553 0.8594088 0.8674843 0.8640252 0.8640503 0.8643648 0.8668805
##  [8] 0.8694969 0.8716478 0.8732704 0.8740881 0.8745031 0.8752075 0.8752075
## [15] 0.8756855 0.8758365 0.8760377 0.8760881 0.8765660 0.8767925 0.8770314
## [22] 0.8770943 0.8771321 0.8769937 0.8768050 0.8768302 0.8768050 0.8766541
## [29] 0.8768428 0.8769057

Now I used KNN method. The highest mean accuracy is 87.9% when k=1.We can see from the plot that accuracy dropped when k=2, but increased when k>=6. Then the accuracy keeps arising.

set.seed(4)
trainIndices = sample(1:dim(df_beer_sort)[1],round(splitPerc * dim(df_beer_sort)[1]))
beer_sort_train = df_beer_sort[trainIndices,]
beer_sort_test = df_beer_sort[-trainIndices,]
classifications_knn=knn(beer_sort_train[,c(4,5)],beer_sort_test[,c(4,5)],beer_sort_train$Style, prob = TRUE, k = 5)
CM_knn = confusionMatrix(table(classifications,beer_sort_test$Style))
classifications_knn
##   [1] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   Stout IPA   IPA   IPA  
##  [13] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [25] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [37] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [49] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [61] IPA   Lager Stout IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [73] IPA   IPA   IPA   IPA   IPA   IPA   Stout IPA   IPA   IPA   Lager IPA  
##  [85] Stout IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
##  [97] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   Lager IPA   IPA   IPA  
## [109] IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA   IPA  
## [121] IPA   IPA   IPA   IPA   IPA   Stout IPA   Lager Stout IPA   Lager IPA  
## [133] IPA   Lager IPA   Lager Lager Lager Lager Lager Lager Lager Lager Lager
## [145] Lager Lager Lager Lager Lager IPA   Lager Lager Lager Lager Lager Lager
## [157] Lager Lager Lager
## attr(,"prob")
##   [1] 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333 0.6000000 0.8000000
##   [8] 1.0000000 0.6000000 0.7142857 1.0000000 1.0000000 0.8888889 0.8888889
##  [15] 0.8888889 1.0000000 1.0000000 0.8000000 1.0000000 1.0000000 0.6666667
##  [22] 1.0000000 1.0000000 1.0000000 1.0000000 0.8888889 1.0000000 1.0000000
##  [29] 1.0000000 1.0000000 1.0000000 1.0000000 0.7500000 1.0000000 1.0000000
##  [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8000000 0.8750000
##  [43] 1.0000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [50] 1.0000000 1.0000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000
##  [57] 1.0000000 1.0000000 1.0000000 0.6250000 1.0000000 0.5000000 0.5714286
##  [64] 1.0000000 0.5555556 1.0000000 0.8888889 0.8750000 1.0000000 1.0000000
##  [71] 0.8333333 0.8000000 0.8000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [78] 0.8333333 0.4000000 1.0000000 1.0000000 1.0000000 0.6000000 1.0000000
##  [85] 0.5000000 0.8333333 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
##  [92] 0.5000000 1.0000000 1.0000000 0.8333333 1.0000000 1.0000000 1.0000000
##  [99] 1.0000000 1.0000000 1.0000000 0.8333333 0.8000000 1.0000000 0.6000000
## [106] 1.0000000 0.8333333 0.8000000 0.8000000 0.6666667 0.6666667 0.8888889
## [113] 1.0000000 1.0000000 1.0000000 1.0000000 0.4000000 1.0000000 1.0000000
## [120] 1.0000000 1.0000000 0.8333333 0.8000000 0.8571429 0.7142857 0.5000000
## [127] 0.7500000 0.7142857 0.4000000 0.8888889 0.6666667 0.5000000 0.4000000
## [134] 1.0000000 0.6000000 0.6666667 1.0000000 0.6666667 0.6666667 1.0000000
## [141] 0.5714286 0.6000000 1.0000000 1.0000000 1.0000000 0.8000000 1.0000000
## [148] 1.0000000 1.0000000 0.6250000 0.8000000 1.0000000 0.6666667 1.0000000
## [155] 1.0000000 1.0000000 0.6666667 1.0000000 0.8000000
## Levels: IPA Stout Lager
CM_knn
## Confusion Matrix and Statistics
## 
##                
## classifications IPA Stout Lager
##           IPA   120     8     5
##           Stout   0     0     0
##           Lager   4     4    18
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8679          
##                  95% CI : (0.8052, 0.9163)
##     No Information Rate : 0.7799          
##     P-Value [Acc > NIR] : 0.003347        
##                                           
##                   Kappa : 0.5924          
##                                           
##  Mcnemar's Test P-Value : 0.007012        
## 
## Statistics by Class:
## 
##                      Class: IPA Class: Stout Class: Lager
## Sensitivity              0.9677      0.00000       0.7826
## Specificity              0.6286      1.00000       0.9412
## Pos Pred Value           0.9023          NaN       0.6923
## Neg Pred Value           0.8462      0.92453       0.9624
## Prevalence               0.7799      0.07547       0.1447
## Detection Rate           0.7547      0.00000       0.1132
## Detection Prevalence     0.8365      0.00000       0.1635
## Balanced Accuracy        0.7982      0.50000       0.8619

Here is one of the case I shuffled the testing and training data once and the accuracy we got is 85% when k=5

#Back to Question 7
df_beer_study=rbind(df_beer_sort, df_beer_Ale)
df_beer_study %>% ggplot(aes(y=ABV, x=IBU))+geom_point(position='jitter')+geom_smooth() +facet_wrap(~Style)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Now we came back to Question 7 by checking four different styles of beer. We can see, majority of lager style beer has low IBU and its ABV values are around 5%. No ABV values of lager beer are above 7.5%. ABV values of Indian Pale Ale style seems like increases when IBU increases, but their ABV values don’t pass 10% and majority of their ABV values are between 5% to 10%. The other types of Ale beer have low IBU values which most of them are below 50. Also, most of their ABV values are between 3.75% to 10%. The data size of stout style beer is small, but we can roughly tell their ABV increases when IBU increases.